library(readxl)
library(dplyr)
library(tidyr)
library(stringr)

library(tidyverse)
library(DT)
library(patchwork)
library(ggthemes)
library(plotly)
library(gapminder)
library(grid)
library(ggplot2)
library(gridExtra)

library(maps)
library(usmap)

1 Title

Exploring the Impact of COVID-19 on State-Level Economic Indicators and housing price in the U.S

1.1 Introduction

This project is to explore how the COVID-19 affected economic conditions especially housing price across U.S. states. Although the final goal is to analyze which factors contributed to post-COVID changes in housing prices, it also contains essential works: collecting, cleaning, visualizing and statistical analysis.

1.2 Data Soueces

  • COVID

  • Zillow Home Value Index (ZHVI)

A measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range. Available as a smoothed, seasonally adjusted measure and as a raw measure.

Source: Zillow Research (https://www.zillow.com/research/data/)

File name: State_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv

  • State unemployment rates over the last 10 years, seasonally adjusted

This dataset shows monthly unemployment rates by U.S. state. It is part of the Local Area Unemployment Statistics (LAUS) program by the Bureau of Labor Statistics (BLS). The data is seasonally adjusted and helps track labor market trends at the state level over time.

Source :U.S. Bureau of Labor Statistics (https://www.bls.gov/charts/state-employment-and-unemployment/state-unemployment-rates-animated.htm)

File name: unemployment rate.xlxs

  • Personal Consumption Expenditures (PCE)

Reports annual state-level PCE data, measuring average household spending across different regions.

Source: Bureau of Economic Analysis (BEA) (https://apps.bea.gov/regional/downloadzip.htm)

File name: SAPCE1__ALL_AREAS_1997_2023.csv

  • Annual Personal Income

Includes annual personal income per capita by state. Useful for assessing economic status and regional income disparities. Source: Bureau of Economic Analysis (BEA) File name: SAINC1__ALL_AREAS_1929_2024.csv → INCOME.csv

  • Gross Domestic Product (GDP)

Provides real GDP data by state from 1997 to 2024. This helps analyze economic output and growth across regions.

Source: Bureau of Economic Analysis (BEA)

File name: SAGDP1__ALL_AREAS_1997_2024.csv

  • State Level population and Household Debt Statistics

Includes per capita credit card debt and total household debt across states. Used to assess consumer financial health.

Sheet Title
population Number of Consumers in New York Fed Consumer Credit Panel
auto loan Auto Debt Balance per Capita
credit card Credit Card Debt Balance per Capita
mortgage Mortgage Debt Balance per Capita (excluding HELOC)
student loan Student Loan Debt Balance per Capita
total Total Debt Balance per Capita
auto loan delinquency Percent of Auto Debt Balance 90+ Days Delinquent
credit card delinquency Percent of Credit Card Debt Balance 90+ Days Delinquent
mortgage delinquency Percent of Mortgage Debt Balance 90+ Days Delinquent
student loan delinquency Percent of Student Loan Debt Balance 90+ Days Delinquent (and in default)

Source: New York Fed (https://www.newyorkfed.org/microeconomics/databank.html) File name: area_report_by_year.xlsx → NYFED.csv

2 DATA CLEANING

2.1 COVID case for US

covid <- read_csv("data/cases_deaths.csv")
#View(covid)

covid_US <- covid %>% 
  filter(country == "United States") %>% 
  select(country, date, new_cases, total_cases, weekly_cases) %>%
  mutate(date = as.Date(date)) %>%
  mutate(year = year(date), month = month(date, label = TRUE, abbr = FALSE)) %>%
  group_by(year, month) %>%
  summarise(
    monthly_new_cases = sum(new_cases, na.rm = TRUE),
    latest_total_cases = max(total_cases, na.rm = TRUE),
    total_weekly_cases = sum(weekly_cases, na.rm = TRUE),
    .groups = "drop"
  )

covid_US
## # A tibble: 145 × 5
##     year month     monthly_new_cases latest_total_cases total_weekly_cases
##    <dbl> <ord>                 <dbl>              <dbl>              <dbl>
##  1     1 January              820465          103436829            4160575
##  2     1 February             701457          103436829            4418032
##  3     1 March                531503          103436829            4672369
##  4     1 April                616176          103436829            4930142
##  5     1 May                 1107849          103436829            5419154
##  6     1 June                1378542          103436829            5839205
##  7     1 July                 945231          103436829            6101223
##  8     1 August              1056186          103436829            6336944
##  9     1 September           1122716          103436829            6758203
## 10     1 October              740848          103436829            6967548
## # ℹ 135 more rows

2.2 COVID Cases per State

covidbystate <-read_csv("data/covid_confirmed_usafacts.csv")

covidStatesDaily <- covidbystate %>% 
  group_by(State) %>% 
  summarize(across(where(is.numeric), sum, na.rm = TRUE))
#covidStatesDaily

covidstates <- covidStatesDaily %>% 
  pivot_longer(
    cols = -State,
    names_to = "date",
    values_to = "cases"
  ) %>%
  filter(str_detect(date, "^\\d{4}-\\d{2}-\\d{2}$")) %>%
  mutate(
    date = ymd(date),
    year = year(date)
  ) %>%
  group_by(State, year) %>%
  summarise(total_cases = max(cases, na.rm = TRUE), .groups = "drop")

#covidstates
case_summary <- summary(covidstates$total_cases)
#case_summary

covidstates <- covidstates %>% 
  mutate(
    severity = case_when(
      total_cases <= case_summary[1] ~ "Very low",    
      total_cases <= case_summary[2] ~ "Low", 
      total_cases <= case_summary[3] ~ "Medium",    
      total_cases <= case_summary[5] ~ "High", 
      TRUE ~ "Very high"                              
    )
  )
covidstates <- covidstates %>%
  mutate(severity = factor(severity, levels = c("Very low", "Low", "Medium", "High", "Very high")))
covidstates
## # A tibble: 204 × 4
##    State  year total_cases severity
##    <chr> <dbl>       <dbl> <fct>   
##  1 AK     2020       46304 Low     
##  2 AK     2021      149907 Low     
##  3 AK     2022      277884 Low     
##  4 AK     2023      287319 Low     
##  5 AL     2020      361226 Medium  
##  6 AL     2021      896614 High    
##  7 AL     2022     1568934 High    
##  8 AL     2023     1659936 High    
##  9 AR     2020      225138 Low     
## 10 AR     2021      562455 Medium  
## # ℹ 194 more rows

2.3 ZHVI

house <- read_csv("data/State_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")

house1 <- house %>%
  rename(State = RegionName) %>%
  select(-StateName, -RegionID, -SizeRank, -RegionType)
house_long <- house1 %>%
  pivot_longer(cols = -State, names_to = "Date", values_to = "ZHVI")
#house_long

house_long1 <- house_long %>%
  filter(str_sub(Date, 1, 4) %in% c("2017", "2018", "2019", "2020", "2021", "2022"))

#house_long1

house_long2 <- house_long1 %>%
  mutate(year = as.numeric(str_sub(Date, 1, 4)))

house_avg <- house_long2 %>%
  group_by(State, year) %>%
  summarise(mean_zhvi = mean(ZHVI, na.rm = TRUE), .groups = "drop")

#house_avg

house2 <- house_avg %>%
  pivot_wider(names_from = year, values_from = mean_zhvi)

#house2
house3 <- house2 %>%
  mutate(
    zhvi_pre_avg = rowMeans(select(., `2017`, `2018`, `2019`), na.rm = TRUE),
    zhvi_post_avg = rowMeans(select(., `2020`, `2021`, `2022`), na.rm = TRUE),
    zhvi_pre_change = ((`2019` - `2017`) / `2017`)*100,
    zhvi_post_change = ((`2022` - `2020`) / `2020`)*100
  ) %>%
  select(State, zhvi_pre_avg, zhvi_post_avg, zhvi_pre_change, zhvi_post_change)

#house3
state_codes <- read.csv("https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv")

house4 <- house3 %>%
  left_join(state_codes, by = c("State" = "State"))

house_final <- house4  %>%
  select(Abbreviation, zhvi_pre_avg, zhvi_post_avg, zhvi_pre_change, zhvi_post_change) %>%
  rename(State = Abbreviation)

house_final 
## # A tibble: 51 × 5
##    State zhvi_pre_avg zhvi_post_avg zhvi_pre_change zhvi_post_change
##    <chr>        <dbl>         <dbl>           <dbl>            <dbl>
##  1 AL         148993.       192427.            9.76            28.0 
##  2 AK         292351.       335028.            7.66            12.3 
##  3 AZ         252522.       368418.           15.0             49.4 
##  4 AR         140966.       175352.            7.83            26.1 
##  5 CA         521016.       663241.           12.7             31.3 
##  6 CO         375678.       486205.           12.6             32.8 
##  7 CT         251600.       305414.            3.63            28.0 
##  8 DE         262137.       318745.            5.58            27.9 
##  9 DC         560797.       633072.            8.25             7.84
## 10 FL         228291.       308956.           13.2             45.6 
## # ℹ 41 more rows
# export to file but I will # because we already have sync folder with github
#write.csv(house_final, "data/data-clean/HOUSING.csv", row.names = FALSE)

2.4 Unemployment Rates

unemploy <- read_excel("data/unemployment rate.xlsx", range = "A1:DR53")
names(unemploy)[-1] <- as.character(as.Date(as.numeric(names(unemploy)[-1]), origin = "1899-12-30"))

pre_df <- unemploy %>%
  select(State, `2017-12-01`, `2019-12-01`) %>%
  # Some is chr so we need to numeric
  mutate(across(-State, ~ as.numeric(.))) %>%
  mutate(unemp_pre = `2019-12-01` - `2017-12-01`) %>%
  rename(unemp_19 = `2019-12-01`, unemp_17=`2017-12-01`) %>% 
  select(State, unemp_pre, unemp_17 ,unemp_19)

post_df <- unemploy %>%
  select(State, `2020-12-01`, `2022-12-01`) %>%
  # Some is chr so we need to numeric
  mutate(across(-State, ~ as.numeric(.))) %>%
  mutate(unemp_post = `2022-12-01` - `2020-12-01`) %>%
  rename(unemp_22 = `2022-12-01`, unemp_20 = `2020-12-01`) %>% 
  select(State, unemp_post, unemp_20, unemp_22)

unemploy2 <- pre_df %>%
  inner_join(post_df, by = "State")

unemploy3 <- unemploy2 %>%
  left_join(state_codes, by = c("State" = "State"))

unemploy_final <- unemploy3 %>% 
  select(Abbreviation, unemp_pre,unemp_17, unemp_19, unemp_post,unemp_20, unemp_22) %>% 
  rename(State = Abbreviation)

unemploy_final
## # A tibble: 52 × 7
##    State unemp_pre unemp_17 unemp_19 unemp_post unemp_20 unemp_22
##    <chr>     <dbl>    <dbl>    <dbl>      <dbl>    <dbl>    <dbl>
##  1 AL       -0.9        4        3.1       -2.2      4.5      2.3
##  2 AK       -1.4        6.6      5.2       -3.1      7.1      4  
##  3 AZ       -0.200      4.9      4.7       -2.8      6.5      3.7
##  4 AR       -0.300      3.8      3.5       -2        5        3  
##  5 CA       -0.400      4.5      4.1       -4.6      9        4.4
##  6 CO       -0.100      2.9      2.8       -3.4      6.4      3  
##  7 CT       -0.600      4.3      3.7       -3.9      7.4      3.5
##  8 DE       -0.6        4.2      3.6       -1.6      5.6      4  
##  9 DC       -0.5        5.9      5.4       -3.1      7.8      4.7
## 10 FL       -1          4        3         -3.3      6.2      2.9
## # ℹ 42 more rows
# export to file but I will # because we already have sync folder with github
#write.csv(unemploy_final, "data/data-clean/UNEMPLOY.csv", row.names = FALSE)

2.5 PCE

pce <- read.csv("data/SAPCE1__ALL_AREAS_1997_2023.csv")

pce_df <- pce %>%
  filter(Description == "Personal consumption expenditures ") %>%
  select(GeoName, X2017, X2019, X2020, X2022) %>%
  rename(state = GeoName) %>%
  filter(state != "United States") %>%
  mutate(
    pce_pre = ((X2019 - X2017) / X2017) * 100,
    pce_post = ((X2022 - X2020) / X2020) * 100
  )

pce_final <- pce_df %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(Abbreviation, pce_pre, pce_post) %>%
  rename(State = Abbreviation)

head(pce_final)
##   State   pce_pre pce_post
## 1    AL  8.277837 22.66264
## 2    AK  5.572225 23.88433
## 3    AZ 10.681811 27.98044
## 4    AR  6.486729 22.09850
## 5    CA 10.787929 25.65983
## 6    CO 11.319759 27.96054
# export to file but I will # because we already have sync folder with github
#write.csv(pce_final, "data/data-clean/PCE.csv", row.names = FALSE)

2.6 Annual Personal Income

income <- read.csv("data/SAINC1__ALL_AREAS_1929_2024.csv")

income_df1 <- income %>%
  filter(str_trim(Description) == "Per capita personal income (dollars) 2/") %>%
  select(GeoName, X2017, X2019, X2020, X2022) %>%
  rename(state = GeoName) %>%
  filter(state != "United States") %>%
  mutate(
    income_pre = ((X2019 - X2017) / X2017) * 100,
    income_post = ((X2022 - X2020) / X2020) * 100
  )

income_final <- income_df1 %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(Abbreviation, income_pre, income_post) %>%
  rename(State = Abbreviation)

head(income_final)
##   State income_pre income_post
## 1    AL   7.581065   12.688960
## 2    AK   7.086726   11.401225
## 3    AZ  10.317882   13.134632
## 4    AR   5.637273   17.407643
## 5    CA  10.315388    9.528295
## 6    CO  13.109126   18.359079
# export to file but I will # because we already have sync folder with github
#write.csv(income_final, "data/data-clean/INCOME.csv", row.names = FALSE)

2.7 GDP

gdp_df <- read.csv("data/SAGDP1__ALL_AREAS_1997_2024.csv")

gdp_real <- gdp_df %>%
  filter(Description == "Real GDP (millions of chained 2017 dollars) 1/")

gdp_df2 <- gdp_real %>%
  select(GeoName, X2017, X2019, X2020, X2022) %>%
  rename(state = GeoName) %>%
  mutate(
    gdp_pre = ((X2019 - X2017) / X2017) * 100,
    gdp_post = ((X2022 - X2020) / X2020) * 100
  ) %>%
  filter(state != "United States")

gdp_final <- gdp_df2 %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(Abbreviation, gdp_pre, gdp_post) %>%
  rename(State = Abbreviation)

head(gdp_final)
##   State   gdp_pre   gdp_post
## 1    AL  3.996621  7.3182724
## 2    AK -2.191186  0.8628568
## 3    AZ  7.948898 12.3828137
## 4    AR  2.694002  8.7421859
## 5    CA  8.358128  8.5462064
## 6    CO  9.570425 10.1607412
# export to file but I will # because we already have sync folder with github
#write.csv(gdp_final, "data/data-clean/GDP.csv", row.names = FALSE)

2.8 Population & Debt Statistics

file_path <- "data/area_report_by_year.xlsx"

# There are many sheets so we should target it
# But auto has diffefent format so we will seperate it
target_sheets <- c(
"population",
"creditcard",
"mortgage",
"student loan",
"total",
"auto_delinq",
"creditcard_delinq",
"mortgage_delinq",
"studentloan_delinq"
)

# auto has different format so seperated
auto <- read_excel(file_path, sheet = "auto", skip = 4, col_names = TRUE)
#auto
for (sheet_name in target_sheets) {
  
  df_name <- gsub(" ", "_", sheet_name)
  
  # The real data start from 9 row in the xlxs file. 
  df <- read_excel(file_path, sheet = sheet_name, skip = 8, col_names = TRUE)
  
  assign(df_name, df)
  #print(df_name)
}

# After load the file, it's nice to check but I will make it # to make the document short.
#population
#auto
#creditcard
#mortgage
#student_loan
#total
#auto_delinq
#creditcard_delinq
#mortgage_delinq
#studentloan_delinq
#population
population_clean <- population %>%
  filter(!is.na(state)) %>%
  mutate(
    population_pre = ((Q4_2019 - Q4_2017) / Q4_2017) * 100,
    population_post = ((Q4_2022 - Q4_2020) / Q4_2020) * 100
  ) %>%
  select(state, population_pre, population_post)

# population_clean
creditcard_clean <- creditcard %>%
  filter(!is.na(state)) %>%
  mutate(
    creditcard_pre = ((Q4_2019 - Q4_2017) / Q4_2017) * 100,
    creditcard_post = ((Q4_2022 - Q4_2020) / Q4_2020) * 100
  ) %>%
  select(state, creditcard_pre, creditcard_post)

# creditcard_clean
auto_clean <- auto %>%
  filter(!is.na(state)) %>%
  mutate(
    auto_pre = ((Q4_2019 - Q4_2017) / Q4_2017) * 100,
    auto_post = ((Q4_2022 - Q4_2020) / Q4_2020) * 100
  ) %>%
  select(state, auto_pre, auto_post)

# auto_clean
mortgage_clean <- mortgage %>%
  filter(!is.na(state)) %>%
  mutate(
    mortgage_pre = ((Q4_2019 - Q4_2017) / Q4_2017) * 100,
    mortgage_post = ((Q4_2022 - Q4_2020) / Q4_2020) * 100
  ) %>%
  select(state, mortgage_pre, mortgage_post)

# mortgage_clean
studentloan_clean <- student_loan %>%
  filter(!is.na(state)) %>%
  mutate(
    studentloan_pre = ((Q4_2019 - Q4_2017) / Q4_2017) * 100,
    studentloan_post = ((Q4_2022 - Q4_2020) / Q4_2020) * 100
  ) %>%
  select(state, studentloan_pre, studentloan_post)

# studentloan_clean
total_clean <- total %>%
  filter(!is.na(state)) %>%
  mutate(
    total_pre = ((Q4_2019 - Q4_2017) / Q4_2017) * 100,
    total_post = ((Q4_2022 - Q4_2020) / Q4_2020) * 100
  ) %>%
  select(state, total_pre, total_post)

# total_clean 
creditcard_delinq_clean <- creditcard_delinq %>%
  filter(!is.na(state)) %>%
  select(state, Q4_2019, Q4_2022) %>%
  rename(
    creditcard_delinq_pre = Q4_2019,
    creditcard_delinq_post = Q4_2022
  )

# creditcard_delinq_clean
auto_delinq_clean <- auto_delinq %>%
  filter(!is.na(state)) %>%
  select(state, Q4_2019, Q4_2022) %>%
  rename(
    auto_delinq_pre = Q4_2019,
    auto_delinq_post = Q4_2022
  )

# auto_delinq_clean
mortgage_delinq_clean <- mortgage_delinq %>%
  filter(!is.na(state)) %>%
  select(state, Q4_2019, Q4_2022) %>%
  rename(
    mortgage_delinq_pre = Q4_2019,
    mortgage_delinq_post = Q4_2022
  )

# mortgage_delinq_clean
studentloan_delinq_clean <- studentloan_delinq %>%
  filter(!is.na(state)) %>%
  select(state, Q4_2019, Q4_2022) %>%
  rename(
    studentloan_delinq_pre = Q4_2019,
    studentloan_delinq_post = Q4_2022
  )

# studentloan_delinq_clean
NYFED_df <- population_clean %>%
  left_join(creditcard_clean, by = "state") %>%
  left_join(auto_clean, by = "state") %>%
  left_join(mortgage_clean, by = "state") %>%
  left_join(studentloan_clean, by = "state") %>%
  left_join(total_clean, by = "state") %>%
  left_join(creditcard_delinq_clean, by = "state") %>%
  left_join(auto_delinq_clean, by = "state") %>%
  left_join(mortgage_delinq_clean, by = "state") %>%
  left_join(studentloan_delinq_clean, by = "state")

head(NYFED_df)
## # A tibble: 6 × 21
##   state population_pre population_post creditcard_pre creditcard_post auto_pre
##   <chr>          <dbl>           <dbl>          <dbl>           <dbl>    <dbl>
## 1 AK            -0.444           -1.15           3.98            13.6   -0.998
## 2 AL             1.58             1.22           7.59            15.5    7.74 
## 3 AR             1.53             2.31          10               16.1    5.34 
## 4 AZ             4.47             4.48          10.2             16.5    9.41 
## 5 CA             1.57             1.87          11.4             19.8    7.53 
## 6 CO             3.31             3.61           8.22            17.6    4.75 
## # ℹ 15 more variables: auto_post <dbl>, mortgage_pre <dbl>,
## #   mortgage_post <dbl>, studentloan_pre <dbl>, studentloan_post <dbl>,
## #   total_pre <dbl>, total_post <dbl>, creditcard_delinq_pre <dbl>,
## #   creditcard_delinq_post <dbl>, auto_delinq_pre <dbl>,
## #   auto_delinq_post <dbl>, mortgage_delinq_pre <dbl>,
## #   mortgage_delinq_post <dbl>, studentloan_delinq_pre <dbl>,
## #   studentloan_delinq_post <dbl>
# Export final result if needed
#write.csv(NYFED_df, "data/data-clean/NYFED.csv", row.names = FALSE)

2.9 Final Data Join

clean_gdp <- read_csv("data/data-clean/GDP.csv")
clean_income <- read_csv("data/data-clean/INCOME.csv")
clean_nyfed <- read_csv("data/data-clean/NYFED.csv")
clean_pce <- read_csv("data/data-clean/PCE.csv")
clean_unemploy <- read_csv("data/data-clean/UNEMPLOY.csv")
clean_housing<- read_csv("data/data-clean/HOUSING.csv")

#clean_gdp 
#clean_income
#clean_nyfed 
#clean_pce 
#clean_unemploy 

clean_nyfed  <- clean_nyfed  %>%
  rename(State = state)

# Find common states
common_states <- Reduce(intersect, list(
  clean_gdp$State,
  clean_income$State,
  clean_pce$State,
  clean_nyfed$State,
  clean_unemploy$State,
  clean_housing$State
))

# Apply only common state
clean_gdp <- clean_gdp %>% filter(State %in% common_states)
clean_income <- clean_income %>% filter(State %in% common_states)
clean_pce <- clean_pce %>% filter(State %in% common_states)
clean_nyfed <- clean_nyfed %>% filter(State %in% common_states)
clean_unemploy <- clean_unemploy %>% filter(State %in% common_states)
clean_housing <- clean_housing %>% filter(State %in% common_states)

# Merge to one data set
merged_df <- clean_housing %>%
  left_join(clean_gdp, by = "State") %>%
  left_join(clean_income, by = "State") %>%
  left_join(clean_pce, by = "State") %>%
  left_join(clean_nyfed, by = "State") %>%
  left_join(clean_unemploy, by = "State")

merged_df
## # A tibble: 51 × 37
##    State zhvi_pre_avg zhvi_post_avg zhvi_pre_change zhvi_post_change gdp_pre
##    <chr>        <dbl>         <dbl>           <dbl>            <dbl>   <dbl>
##  1 AL         148993.       192427.            9.76            28.0    4.00 
##  2 AK         292351.       335028.            7.66            12.3   -2.19 
##  3 AZ         252522.       368418.           15.0             49.4    7.95 
##  4 AR         140966.       175352.            7.83            26.1    2.69 
##  5 CA         521016.       663241.           12.7             31.3    8.36 
##  6 CO         375678.       486205.           12.6             32.8    9.57 
##  7 CT         251600.       305414.            3.63            28.0    0.493
##  8 DE         262137.       318745.            5.58            27.9    7.29 
##  9 DC         560797.       633072.            8.25             7.84   3.43 
## 10 FL         228291.       308956.           13.2             45.6    6.90 
## # ℹ 41 more rows
## # ℹ 31 more variables: gdp_post <dbl>, income_pre <dbl>, income_post <dbl>,
## #   pce_pre <dbl>, pce_post <dbl>, population_pre <dbl>, population_post <dbl>,
## #   creditcard_pre <dbl>, creditcard_post <dbl>, auto_pre <dbl>,
## #   auto_post <dbl>, mortgage_pre <dbl>, mortgage_post <dbl>,
## #   studentloan_pre <dbl>, studentloan_post <dbl>, total_pre <dbl>,
## #   total_post <dbl>, creditcard_delinq_pre <dbl>, …
# export to file but I will # because we already have sync folder with github
# write.csv(merged_df, "data/data-clean/FINAL.csv", row.names = FALSE)

3 COVID Mapping Visualization

df<- read_csv("data/data-clean/FINAL.csv")

#df

3.1 COVID distribution

#ggplot(covidstates, aes(x = severity, fill = severity)) +
#  geom_bar() +
#  labs(title = "Number of States by COVID Severity Level",
#       x = "Severity Level", y = "Number of States") +
#  scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = #"red", "Very high" = "brown")) +
#  theme_minimal()

ggplot(covidstates, aes(x = severity, y = total_cases, fill = severity)) +
  geom_boxplot() +
  labs(title = "Distribution of Total Cases by Severity Level",
       x = "Severity", y = "Total Cases") +
  scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = "red", "Very high" = "brown")) + theme_minimal()

covidstates %>%
  ggplot(aes(x = year, fill = severity)) +
  geom_bar(position = "fill") +
  labs(title = "Distribution of COVID Severity by Year",
       y = "Proportion of States", x = "Year") +
  scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = "red", "Very high" = "brown")) +
  theme_minimal()

3.2 6 State

covidstates %>%
  filter(State %in% c("CA", "TX", "NY", "IL", "AZ", "FL")) %>%
  ggplot(aes(x = year, y = total_cases, color = State)) +
  geom_line(size = 1.2) +
  labs(title = "COVID Cases Over Time",
       x = "Year", y = "Total Cases") +
  theme_minimal()

3.3 Covid state map

state_abbreviations <- data.frame(
  abbreviation = state.abb, 
  full_name = state.name  
)


covidstatesFULLNAME <- covidstates %>%
  left_join(state_abbreviations, by = c("State" = "abbreviation")) %>%
  mutate(State = tolower(full_name))%>%
  select(-full_name)

# map visualization code
us_states <- map_data("state")
us_states$region <- tolower(us_states$region) 

# Merge the map data with the COVID data
covid_map_data <- us_states %>%
  left_join(covidstatesFULLNAME, by = c("region" = "State")) %>%
  filter(!is.na(severity))


ggplot(covid_map_data, aes(x = long, y = lat, group = group, fill = severity)) +
  geom_polygon(color = "black") +
  coord_fixed(1.1) +
  labs(title = "COVID Case Severity by State",
       fill = "Severity") +
  scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = "red", "Very high"= "brown")) +
  theme_void() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# covidstatesFULLNAME
# covidstates

4 Visuals of economic indicators

4.1 Population

4.1.1 Pre (2017–2019)

merged_dfLC <- merged_df %>%
  rename(state = State)

## PRE covid populations
plot_usmap(data = merged_dfLC, values = "population_pre", regions = "state") +
  scale_fill_continuous(
    name = "Population",
    low = "lightgreen", high = "darkgreen", label = scales::comma
  ) +
  labs(title = "Pre-COVID Population (2017–2019)") +
  theme(legend.position = "right")

4.1.2 Post (2020–2022)

## POST covid populations
plot_usmap(data = merged_dfLC, values = "population_post", regions = "state") +
  scale_fill_continuous(
    name = "Population",
    low = "lightblue", high = "darkblue", label = scales::comma
  ) +
  labs(title = "Post-COVID Population (2020–2022)") +
  theme(legend.position = "right")

4.1.3 Comparison

###  pre and post side by side
population_long <- merged_dfLC %>%
  select(state, population_pre, population_post) %>%
  pivot_longer(
    cols = c(population_pre, population_post),
    names_to = "period",
    values_to = "population"
  ) %>%
  mutate(
    period = recode(period,
                    "population_pre" = "Pre-COVID (2017–2019)",
                    "population_post" = "Post-COVID (2020–2022)"),
    period = factor(period, levels = c("Pre-COVID (2017–2019)", "Post-COVID (2020–2022)"))
  )

plot_usmap(data = population_long, values = "population", regions = "state") +
  facet_wrap(~ period) +
  scale_fill_continuous(
    name = "Population",
    low = "lightgreen", high = "darkgreen", label = scales::comma
  ) +
  labs(title = "Population: Pre vs. Post COVID") +
  theme(legend.position = "right")

4.2 GDP

# compare GDP
df %>% 
  select(State, gdp_pre, gdp_post) %>% 
  ggplot()+
  geom_bar(mapping = aes(x=reorder(State, -gdp_post), y=gdp_pre
                         , fill = "GDP pre = ((X2019 - X2017) / X2017) * 100")
           , stat = "identity"
           , position = position_nudge(x = -0.3)
           , width = 0.4)+
  geom_bar(mapping = aes(x=reorder(State, -gdp_post), y=gdp_post, fill ="GDP post = ((X2022 - X2020) / X2020) * 100")
           , stat = "identity"
           , position = position_nudge(x = 0.3)
           , width = 0.4)+
  labs(title = "Real GDP Growth: Pre vs Post COVID", y = "GDP Change (%)", x = "")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")

4.3 Income

df %>%
  select(State, income_pre, income_post) %>%
  ggplot() +
  geom_bar(mapping = aes(x = reorder(State, -income_post)
                         , y = income_pre
                         , fill = "Income pre = ((X2019 - X2017) / X2017) * 100")
           , stat = "identity"
           , position = position_nudge(x = -0.3)
           , width = 0.4) +
  geom_bar(mapping = aes(x = reorder(State, -income_post)
                         , y = income_post
                         , fill = "Income post = ((X2022 - X2020) / X2020) * 100")
           ,stat = "identity"
           , position = position_nudge(x = 0.3)
           , width = 0.4) +
  labs(title = "Personal Income Growth: Pre vs Post COVID", y = "Income Change (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")

4.4 PCE

df %>%
  select(State, pce_pre, pce_post) %>%
  ggplot() +
  geom_bar(mapping = aes(x = reorder(State, -pce_post)
                         , y = pce_pre
                         , fill = "PCE pre = ((X2019 - X2017) / X2017) * 100")
           , stat = "identity"
           , position = position_nudge(x = -0.3)
           , width = 0.4) +
  geom_bar(mapping = aes(x = reorder(State, -pce_post)
                         , y = pce_post
                         , fill = "PCE post = ((X2022 - X2020) / X2020) * 100")
           , stat = "identity"
           , position = position_nudge(x = 0.3)
           , width = 0.4) +
  labs(title = "Personal Consumption Expenditures Growth: Pre vs Post COVID",
       y = "PCE Change (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")

4.5 Unemployment Rate

df %>%
  select(State, unemp_pre, unemp_post) %>%
  ggplot() +
  geom_bar(aes(x = reorder(State, -unemp_post)
               , y = unemp_pre, 
               fill = "Unemp pre = 2019 - 2017")
           , stat = "identity"
           , position = position_nudge(x = -0.3)
           , width = 0.4) +
  geom_bar(aes(x = reorder(State, -unemp_post)
               , y = unemp_post
               , fill = "Unemp post = (2022 - 2020)")
           , stat = "identity"
           , position = position_nudge(x = 0.3)
           , width = 0.4) +
  labs(title = "Unemployment Rate Change: 2017–2019 vs 2020–2022", y = "Change in Unemployment Rate", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),legend.position = "bottom")

df_emp <- df %>%
  select(State, unemp_17, unemp_19, unemp_20, unemp_22)

#df_emp

df_long <- df_emp %>% 
  pivot_longer(cols = starts_with("unemp_"), names_to = "Year", values_to = "Unemployment") %>%
  mutate(Year = recode(Year,
                       "unemp_17" = "2017",
                       "unemp_19" = "2019",
                       "unemp_20" = "2020",
                       "unemp_22" = "2022"))

#df_long 

ggplot(df_long, aes(x = Year, y = Unemployment, group = State, color = State)) +
  geom_line(size = 1) +
  labs(title = "Unemployment Rate by State", y = "Unemployment Rate (%)", x = "Year") +
  theme(legend.position = "none")

5 Debt and delinquency Visuals

5.1 creditcard

5.1.1 Fomula

Variable Name Formula or Definition Description
creditcard_pre ((Q4_2019 - Q4_2017) / Q4_2017) × 100 Credit card debt growth rate (Pre-COVID)
creditcard_post ((Q4_2022 - Q4_2020) / Q4_2020) × 100 Credit card debt growth rate (Post-COVID)
creditcard_delinq_pre Q4_2019 Credit card delinquency rate (Pre-COVID)
creditcard_delinq_post Q4_2022 Credit card delinquency rate (Post-COVID)

5.1.2 Visual 1

credit_df <- df %>% 
  select(State, creditcard_pre, creditcard_post) %>% 
  pivot_longer(cols = c(creditcard_pre, creditcard_post),
               names_to = "Time", values_to = "Change") %>% 
  mutate(Time = factor(Time, levels = c("creditcard_pre", "creditcard_post")))

## credit_df

ggplot(credit_df, aes(x = State, y = Change, fill = Time)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7), width = 0.9) +  
  labs(title = "Change Rate in Credit Card Dept",
       y = "Credit Card Dept Change Rate (%)", x = "")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        legend.position = "bottom")

5.1.3 Visual 2

credit_delinq_df <- df %>% 
  select(State, creditcard_delinq_pre, creditcard_delinq_post) %>% 
  pivot_longer(cols = c(creditcard_delinq_pre, creditcard_delinq_post),
               names_to = "Time", values_to = "Change") %>% 
  mutate(Time = factor(Time, levels = c("creditcard_delinq_pre", "creditcard_delinq_post")))

#credit_delinq_df

gg_debt <- ggplot(credit_df, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  labs(title = "Credit Card Debt Change (%)", y = "Change Rate (%)", x = "")+
  theme(legend.position = "bottom")

#gg_debt 

gg_delinq <- ggplot(credit_delinq_df, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  labs(title = "Credit Card Delinquency Rate (%)", y = "Delinquency Rate (%)", x = "")+
  theme(legend.position = "bottom")

#gg_delinq

grid.arrange(gg_debt, gg_delinq, ncol = 2)

5.2 Mortgage

#Percent Change in Mortgage Debt Balance per Capita (2017 → 2019)
qq1 <- ggplot(df, aes(x = "", y = mortgage_pre)) +
  geom_boxplot(fill = "pink") +
  labs(title = "Mortgage Debt per Capital -Pre", y = "Mortgage Dept Change Rate (%)")

# Percent Change in Mortgage Debt Balance per Capita (2020 → 2022)
qq2 <- ggplot(df, aes(x = "", y = mortgage_post)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Mortgage Debt per Capital - Post", y = "Mortgage Dept Change Rate (%)")

qq3 <- ggplot(df, aes(x = "", y = mortgage_delinq_pre)) +
  geom_boxplot(fill = "orange") +
  labs(title = "Mortgage Delinquency - Pre", y = "Delinquency Rate (%)")

qq4 <- ggplot(df, aes(x = "", y = mortgage_delinq_post)) +
  geom_boxplot(fill = "purple") +
  labs(title = "Mortgage Delinquency - Post", y = "Delinquency Rate (%)")

grid.arrange(qq1, qq2, qq3, qq4, ncol = 4)

5.3 Student Loan

df %>%
  select(State, studentloan_delinq_pre, studentloan_delinq_post) %>%
  ggplot() +
  geom_bar(aes(x = reorder(State, -studentloan_delinq_post)
               , y = studentloan_delinq_pre
               ,fill = "Student Loan pre = Q4_2019")
           , stat = "identity"
           , position = position_nudge(x = -0.3)
           , width = 0.4) +
  geom_bar(aes(x = reorder(State, -studentloan_delinq_post)
               , y = studentloan_delinq_post
               , fill = "Student Loan post = Q4_2022")
           , stat = "identity"
           , position = position_nudge(x = 0.3)
           , width = 0.4) +
  labs(title = "Student Loan Delinquency: Pre vs Post COVID", y = "Delinquency Rate (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),legend.position = "bottom")

student_df <- df %>% 
  select(State, studentloan_pre, studentloan_post) %>% 
  pivot_longer(cols = c(studentloan_pre, studentloan_post), names_to = "Time", values_to = "Change") %>%
  mutate(Time= ifelse(Time == "studentloan_pre", "Pre", "Post"))

#student_df

student_df$Time <- factor(student_df$Time, levels = c("Pre", "Post"))

gg_student <- ggplot(student_df, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.4) +
  labs(title = "Student Loan Debt Change (%)", y = "Change Rate (%)", x = "") +
  scale_fill_manual(values = c( "Pre" = "orange","Post" = "purple"))

#gg_student

#names(df)

student_df2 <- df %>% 
  select(State, studentloan_delinq_pre, studentloan_delinq_post) %>% 
  pivot_longer(cols = c(studentloan_delinq_pre, studentloan_delinq_post), names_to = "Time", values_to = "Change") %>%
  mutate(Time= ifelse(Time == "studentloan_delinq_pre", "Pre", "Post"))

#student_df2

student_df2$Time <- factor(student_df2$Time, levels = c("Pre", "Post"))

gg_student2 <- ggplot(student_df2, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.4) +
  labs(title = "Student Loan Debt Deliq Change (%)", y = "Change Rate (%)", x = "") +
  scale_fill_manual(values = c( "Pre" = "yellow", "Post" = "red"))

#gg_student2

grid.arrange(gg_student, gg_student2, ncol = 2)

5.4 Auto

df_auto <- df %>%
  select(auto_pre, auto_post, auto_delinq_pre, auto_delinq_post) %>%
  pivot_longer(cols = everything(), names_to = "Category", values_to = "Value") %>% 
  mutate(Category = factor(Category, levels = c("auto_pre", "auto_post", "auto_delinq_pre", "auto_delinq_post")))


ggplot(df_auto, aes(x = Category, y = Value, fill = Category)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplots of Auto Loan and Delinquency (Pre vs Post)",
       x = NULL,
       y = "Value") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1), legend.position = "bottom")

6 Housing Changing Rate Visualizations

6.1 Pre(2017–2019)

merged_dfLC <- merged_df %>%
  rename(state = State)

## heat map for PRE covid housing change rate
plot_usmap(data = merged_dfLC, values = "zhvi_pre_change", regions = "state") +
  scale_fill_continuous(
    name = "Zillow Home Value Change (%)",
    low = "lightyellow", high = "red", label = scales::comma
  ) +
  labs(title = "Pre-COVID Housing Price Change (2017–2019)") +
  theme(legend.position = "right")

6.2 Post(2020–2022)

## heat map for POST covid housing change rate
plot_usmap(data = merged_dfLC, values = "zhvi_post_change", regions = "state") +
  scale_fill_continuous(
    name = "Zillow Home Value Change (%)",
    low = "lightblue", high = "darkblue", label = scales::comma
  ) +
  labs(title = "Post-COVID Housing Price Change (2020–2022)") +
  theme(legend.position = "right")

6.3 Comparison

### both pre and post, side by side

housing_long <- merged_dfLC %>%
  select(state, zhvi_pre_change, zhvi_post_change) %>%
  pivot_longer(
    cols = c(zhvi_pre_change, zhvi_post_change),
    names_to = "period",
    values_to = "change"
  ) %>%
  mutate(
    period = recode(period,
                    "zhvi_pre_change" = "Pre-COVID (2017–2019)",
                    "zhvi_post_change" = "Post-COVID (2020–2022)")) %>% 
  mutate(period = factor(period, levels = c("Pre-COVID (2017–2019)", "Post-COVID (2020–2022)")))



plot_usmap(data = housing_long, values = "change", regions = "state") +
  facet_wrap(~ period) +
  scale_fill_continuous(
    name = "ZHVI Change (%)",
    low = "lightyellow", high = "red", label = scales::comma
  ) +
  labs(title = "Housing Price Change: Pre vs. Post COVID") +
  theme(legend.position = "right")